 ************************************************
** Build country-specific firm panel data set and construct TFP
************************************************

 
** ORBIS NUTS: draw data set and store only NUTS region and firm ID
 
 forvalues yyy=2016(1)2019{
use bvdid  NUTS* CLOSDATE_year if CLOSDATE_year  ==`yyy' using "$BvD_input/Orbis_NUTS_2016_2020.dta", clear
gen idnr = bvdid
bysort idnr CLOSDATE_year: gen xxx=_n
sum xxx, det
drop if xxx>1
rename *, lower

save  "$BvD_input/NUTS_`yyy'.dta", replace
 }
forvalues yyy=2015(1)2015{
use bvdid  NUTS* CLOSDATE_year if CLOSDATE_year  ==`yyy' using "$BvD_input/Orbis_NUTS_2015.dta", clear
gen idnr = bvdid
bysort idnr CLOSDATE_year: gen xxx=_n
sum xxx, det
drop if xxx>1
 rename *, lower
save  "$BvD_input/NUTS_`yyy'.dta", replace
 }
forvalues yyy=2010(1)2014{
use bvdid  NUTS* CLOSDATE_year if CLOSDATE_year  ==`yyy' using "$BvD_input/Orbis_NUTS_2010_2014.dta", clear
gen idnr = bvdid
bysort idnr CLOSDATE_year: gen xxx=_n
sum xxx, det
drop if xxx>1
 rename *, lower
save  "$BvD_input/NUTS_`yyy'.dta", replace
 }

*** GENERATE AMADEUS DATA SET: for size reasons, loop through 3 separate data sets
****** (suffixes denote the years of coverage: 2010-2014, 2015, and 2016-2019)



forvalues yyy=2010(1)2014{
use if CLOSDATE_year==`yyy' using "$BvD_input/RAW_FULL_Amadeus_Europe_2010_2014.dta", clear

rename *, lower

* construct NACE code (2 digit)
gen  snace_d2=substr(nace_prim_code,1,2)
destring snace_d2,gen(nace_d2)

* Keep only manufacturing industries
keep if nace_d2>=10  &  nace_d2<=32

* drop firms that having missing variables that construct TFP
keep if (emp > 0  & emp != .) & (staf > 0  & staf != .) & (av > 0 & av != .) & ((tfas !=.   & tfas >0)) 

gen log_av=ln(av)
gen log_empl=ln(empl)
gen log_tfas=ln(tfas)

save "$BvD_input/Amadeus_Europe_`yyy'_coded.dta", replace

}
 
forvalues yyy=2015(1)2015{
 use if CLOSDATE_year==`yyy' using "$BvD_input/RAW_FULL_Amadeus_Europe_2015.dta", clear
rename *, lower
gen  snace_d2=substr(nace_prim_code,1,2)
destring snace_d2,gen(nace_d2)
keep if nace_d2>=10  &  nace_d2<=32
keep if (emp > 0  & emp != .) & (staf > 0  & staf != .) & (av > 0 & av != .) & ((tfas !=.   & tfas >0)) 

gen log_av=ln(av)
gen log_empl=ln(empl)
gen log_tfas=ln(tfas)
save "$BvD_input/Amadeus_Europe_`yyy'_coded.dta", replace
}

forvalues yyy=2016(1)2019{
use if CLOSDATE_year==`yyy' using "$BvD_input/RAW_FULL_Amadeus_Europe_2016_2020.dta", clear
rename *, lower
gen  snace_d2=substr(nace_prim_code,1,2)
destring snace_d2,gen(nace_d2)
keep if nace_d2>=10  &  nace_d2<=32
keep if (emp > 0  & emp != .) & (staf > 0  & staf != .) & (av > 0 & av != .) & ((tfas !=.   & tfas >0)) 

gen log_av=ln(av)
gen log_empl=ln(empl)
 gen log_tfas=ln(tfas)
save "$BvD_input/Amadeus_Europe_`yyy'_coded.dta", replace
}
 

*** merge on NUTS variable year by year

forvalues yyy=2010(1)2019{
use "$BvD_input/Amadeus_Europe_`yyy'_coded.dta", clear
keep if nace_d2>=10  &  nace_d2<=32
* drop duplicate firms
bysort idnr: gen   xxx=_n
drop if xxx>1
drop xxx
merge 1:1 idnr closdate_year using "$BvD_input/NUTS_`yyy'.dta"
drop xxx
drop if _merge==2
rename _merge merge_`yyy' 
 save "$BvD_input/Amadeus_Europe_`yyy'_coded_NUTS.dta", replace
}

** append panel data set
use "$BvD_input/Amadeus_Europe_2010_coded_NUTS.dta", clear
forvalues yyy=2011(1)2019{
append using "$BvD_input/Amadeus_Europe_`yyy'_coded_NUTS.dta"
} 
save "$BvD_input/Amadeus_Europe_2010_2019_coded_NUTS.dta", replace


use "$BvD_input/Amadeus_Europe_2010_2019_coded_NUTS.dta", clear
** Prepare: generate latest obs year for each firm (will keep only the latest observation)
bysort idnr: egen latestyear=max(closdate_year)

  
** Fill in NUTS region, if missing, from previous years (for latest year, which we'll keep)
foreach var of varlist nuts2 nuts3 {
forvalues lll=1(1)10{
sort idnr closdate_year
by idnr: replace `var' = `var'[_n-`lll'] if  `var'=="" & `var'[_n-`lll'] !="" &  closdate_year==latestyear
}
}

** Keep latest observation per firm
keep if closdate_year==latestyear


keep if nuts2!="" | nuts3!="" 
gen oone=1 
bysort cntrycde: egen ccount=total(oone)
drop if ccount<100

drop if cntrycde==""
encode   cntrycde  ,gen(country)
sum country

local max = r(max)
disp `max'

save  "$BvD_input/Amadeus_Europe_2010_2019_coded_latest_NUTS.dta", replace

** Save country-specific data sets
use "$BvD_input/Amadeus_Europe_2010_2019_coded_latest_NUTS.dta", clear

forval i = 1/`max' {
  use "$BvD_input/Amadeus_Europe_2010_2019_coded_latest_NUTS.dta" if country == `i' , clear 

timer on 1
 
preserve
keep if closdate_year >= 2005 & closdate_year <= 2021
foreach var of varlist staf av  tfas {
	replace `var' = . if `var' <= 0
}

** Generate Cobb Douglas parameters: labor share for each NACE 2-digit
gen include = 0
replace include = 1 if staf > 0 & av > 0 & staf != . & av != . // include only firms for which we observe staf and av simultaneously
*construct total payroll and total value added by industry-year
foreach var of varlist staf av {
	bysort closdate_year nace_d2 include: gegen sum_`var' = sum(`var')
	replace sum_`var' = . if include == 0
}
* generate labor share by industry(-year)
** note: we call "alpha" the labor share
gen alpha_jt = sum_staf / sum_av
*dealing with extreme values : max value by sector
gen indicator_geq1 = 0 if include == 1
replace indicator_geq1 = 1 if alpha_jt >= 1
bysort nace_d2 indicator_geq1: gegen m_alpha_jt = max(alpha_jt)
bysort nace_d2: gegen max_alpha_jt = min(m_alpha_jt)
replace alpha_jt = max_alpha_jt if alpha_jt >= 1 & alpha_jt != .
replace alpha_jt = 0.9999 if alpha_jt >= 1 & alpha_jt != . // intended to deal with a single obs in nace 4d industry
collapse (mean) alpha_jt, by(nace_d2)
tempfile alpha
save "`alpha'"
restore

fmerge m:1 nace_d2 using `alpha', nogen keep(1 3)
foreach var of varlist  tfas {
	gen log_TFPr_`var' = log_av - (alpha_j)*log_empl - (1-alpha_j)*log_`var'
	label variable log_TFPr_`var' "TFPr using `var' and 2-digit NACE labor shares"
	cap winsor log_TFPr_`var', gen(w1_log_TFPr_`var') p(0.01)
	cap  label variable w1_log_TFPr_`var' "Winsor 1%: TFPr using `var' and 2-digit NACE labor shares"
		}
 		quietly gen lvapw=log(av)-log(empl)
cap winsor lvapw, gen(w1_lvapw) p(0.01)
	cap  label variable w1_lvapw "Winsor 1%: lvapw"
		
levelsof cntrycde if _n == 2, local(cntrycde2)
disp `cntrycde2'


**Generate NUTS-level-specific data sets


foreach nuts in 2 3{
preserve
**Indices: S is region; J is industry 
**(T=1 is year but not used in cross sectional design)
quietly egen S =group(nuts`nuts')
quietly gen T=1
quietly rename nace_d2 J
quietly drop if S==.
quietly drop if J==.
*Need at least 2 obs for a cell (for measurement error correction)
bys S J T: gen Number=_N

** Save country-specific data sets
local xxx=`cntrycde2'
*YYY save only if dataset is non-empty after cuts
	if _N>0{
	drop if Number<2
	drop Number
    save "$BvD_input/Amadeus_Europe_2010_2019_coded_latest_NUTS`nuts'_`xxx'.dta", replace
	}
	restore
    }
	}
